{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b84e1e81",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e046a7b42826456cbca8e529e75083b4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "56cb68b2f76541eca249234c05bfc1cd",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "data/train-00000-of-00002.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "153f5fd942b24a238d26d987361f8959",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "data/train-00001-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from huggingface_hub import snapshot_download\n",
    "\n",
    "# Fetch only the parquet shards of the dataset repo into a local folder;\n",
    "# `f` receives the value returned by snapshot_download.\n",
    "f = snapshot_download(\n",
    "    repo_id=\"mesolitica/Emilia-Mandarin-Description-Instructions\",\n",
    "    repo_type=\"dataset\",\n",
    "    local_dir=\"./Emilia-Mandarin-Description-Instructions\",\n",
    "    allow_patterns=\"data/*.parquet\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "acda3802",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "from tqdm import tqdm\n",
    "from multiprocess import Pool\n",
    "import itertools\n",
    "import zipfile\n",
    "import os\n",
    "\n",
    "def chunks(l, n):\n",
    "    \"\"\"Yield `(piece, index)` pairs, cutting `l` into consecutive slices of length `n`.\n",
    "\n",
    "    The final slice may be shorter than `n`; `index` counts the slices from 0.\n",
    "    \"\"\"\n",
    "    for index, start in enumerate(range(0, len(l), n)):\n",
    "        yield (l[start: start + n], index)\n",
    "\n",
    "\n",
    "def multiprocessing(strings, function, cores=6, returned=True):\n",
    "    \"\"\"Apply `function` to slices of `strings` in a pool of worker processes.\n",
    "\n",
    "    `strings` is cut into at most `cores` consecutive slices by `chunks`; each\n",
    "    worker call receives one `(slice, slice_index)` tuple.\n",
    "\n",
    "    Args:\n",
    "        strings: sliceable sequence supporting `len()`.\n",
    "        function: callable taking one `(slice, slice_index)` tuple; when\n",
    "            `returned` is true it must return an iterable.\n",
    "        cores: number of worker processes.\n",
    "        returned: when true, the per-slice results are concatenated in input\n",
    "            order and returned as a list; otherwise None is returned.\n",
    "\n",
    "    Note: inside the notebook this name shadows the standard-library\n",
    "    `multiprocessing` module.\n",
    "    \"\"\"\n",
    "    if len(strings) == 0:\n",
    "        # Nothing to distribute; do not start a pool at all.\n",
    "        return [] if returned else None\n",
    "\n",
    "    # Ceiling division: the slice size can never be 0 (floor division made\n",
    "    # range() fail with a zero step whenever len(strings) < cores) and no\n",
    "    # extra remainder slice beyond `cores` is produced.\n",
    "    chunk_size = -(-len(strings) // cores)\n",
    "    pool = Pool(cores)\n",
    "    try:\n",
    "        pooled = pool.map(function, chunks(strings, chunk_size))\n",
    "    finally:\n",
    "        # Release the workers even when a task raises.\n",
    "        pool.close()\n",
    "        pool.join()\n",
    "\n",
    "    if returned:\n",
    "        return list(itertools.chain.from_iterable(pooled))\n",
    "\n",
    "def loop(files):\n",
    "    \"\"\"Unpack each zip archive of a chunk into the current directory, then delete it.\n",
    "\n",
    "    `files` is a `(paths, chunk_index)` tuple as produced by `chunks`; the\n",
    "    index is not used. A later cell defines another `loop`, which replaces\n",
    "    this one once that cell has been executed.\n",
    "    \"\"\"\n",
    "    archive_paths, _unused_index = files\n",
    "    target_dir = './'\n",
    "    for archive_path in tqdm(archive_paths):\n",
    "        with zipfile.ZipFile(archive_path, 'r') as archive:\n",
    "            archive.extractall(target_dir)\n",
    "        # The archive is removed only after extraction finished without error.\n",
    "        os.remove(archive_path)\n",
    "\n",
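    "# Disabled by default: uncomment to unpack every *.zip in the working\n",
    "# directory in parallel (each archive is deleted after extraction).\n",
    "# files = glob('*.zip')\n",
    "# if len(files):\n",
    "#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)"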
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "356cfc51",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/lib/python3/dist-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.17.3 and <1.25.0 is required for this version of SciPy (detected version 1.26.4\n",
      "  warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoProcessor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "05a611b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Processor for the Qwen2-Audio instruct checkpoint; its chat template is\n",
    "# applied by `loop` further down. Keep a handle on its tokenizer as well.\n",
    "processor = AutoProcessor.from_pretrained(\n",
    "    \"Qwen/Qwen2-Audio-7B-Instruct\",\n",
    ")\n",
    "tokenizer = processor.tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3f73f0fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import glob\n",
    "import pandas as pd\n",
    "\n",
    "rows = []\n",
    "\n",
    "# Sort the shard paths: glob() returns them in arbitrary filesystem order,\n",
    "# which would make the order of `rows` differ between runs.\n",
    "files = sorted(glob('Emilia-Mandarin-Description-Instructions/data/*.parquet'))\n",
    "for f in files:\n",
    "    # One dict per parquet row, keyed by column name.\n",
    "    df = pd.read_parquet(f).to_dict(orient = 'records')\n",
    "    rows.extend(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "07a8ff8d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "292649"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of records loaded from the parquet shards.\n",
    "len(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "1f0fe0c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def loop(rows):\n",
    "    \"\"\"Render each record of a chunk into a chat-template training example.\n",
    "\n",
    "    Args:\n",
    "        rows: `(records, chunk_index)` tuple as produced by `chunks`; the index\n",
    "            is not used. Each record is a dict from which 'audio_filename',\n",
    "            'question' and 'answer' are read.\n",
    "\n",
    "    Returns:\n",
    "        list of `{'text': ..., 'audio': ...}` dicts, one per record whose audio\n",
    "        file exists on disk and whose template rendering succeeded, in input\n",
    "        order.\n",
    "\n",
    "    Relies on the notebook-level names `processor`, `os` and `tqdm`. This\n",
    "    definition replaces the zip-extraction `loop` defined in an earlier cell.\n",
    "    \"\"\"\n",
    "    import logging\n",
    "\n",
    "    logger = logging.getLogger(__name__)\n",
    "    records, _ = rows\n",
    "    data = []\n",
    "    for r in tqdm(records):\n",
    "        f = r['audio_filename']\n",
    "        if not os.path.exists(f):\n",
    "            # Audio not available locally: drop the record.\n",
    "            continue\n",
    "\n",
    "        try:\n",
    "            conversation = [\n",
    "                {\"role\": \"user\", \"content\": [\n",
    "                    # Fixed placeholder URL; the actual audio path is stored in\n",
    "                    # the 'audio' field of the output record.\n",
    "                    {\"type\": \"audio\", \"audio_url\": \"audio.wav\"},\n",
    "                    {\"type\": \"text\", \"text\": r['question']},\n",
    "                ]},\n",
    "                {\"role\": \"assistant\", \"content\": r['answer']},\n",
    "            ]\n",
    "            text = processor.apply_chat_template(conversation, tokenize=False)\n",
    "        except Exception as e:\n",
    "            # Still best-effort (the record is skipped), but no longer silent.\n",
    "            logger.warning('skipping %s: %r', f, e)\n",
    "            continue\n",
    "\n",
    "        data.append({\n",
    "            'text': text,\n",
    "            'audio': f,\n",
    "        })\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "b667c078",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5578.95it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5599.02it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5621.83it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5612.75it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5575.41it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5591.80it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5585.33it/s]\n",
      " 73%|███████████████████████████████████████████████████████████                      | 7115/9754 [00:01<00:00, 5387.39it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5587.48it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5436.90it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5596.09it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5278.49it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5612.71it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5601.26it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5537.53it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5607.52it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5557.40it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5594.49it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 4919.75it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5634.98it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5563.21it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5627.04it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5334.89it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 5405.51it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5614.07it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5511.50it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5381.96it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5240.95it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5619.12it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5628.40it/s]\n",
      "100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5645.96it/s]\n"
     ]
    }
   ],
   "source": [
    "# Build the examples in parallel worker processes; results keep input order.\n",
    "processed = multiprocessing(rows, loop, cores=30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b7e7d3c5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "292649"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of examples produced; records with missing audio or a failed template are dropped.\n",
    "len(processed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e30cd7eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the processed examples as a single JSON array.\n",
    "with open('prepare-Emilia-Mandarin-Description-Instructions.json', mode='w') as fopen:\n",
    "    json.dump(processed, fp=fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a5971c69",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'text': '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n<|im_start|>user\\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\\ncan you describe the audio<|im_end|>\\n<|im_start|>assistant\\nThe speaker is discussing the role of Baidu, a prominent Chinese internet company, in collecting user data through its products. The statement highlights how Baidu can directly access and utilize user information and habits through the various products it offers.\\n\\nHere’s a breakdown of the key points:\\n\\n1. **Products as Data Collection Tools**: The phrase \"依靠这些产品\" (relying on these products) suggests that Baidu\\'s products are the primary means through which the company gathers user data. These products could include search engines, maps, news apps, and other services that users interact with regularly.\\n\\n2. **Direct Access to User Information**: \"可以直接掌握用户资讯\" (can directly grasp user information) indicates that Baidu has immediate and direct access to the data generated by users when they use these products. This includes personal details, search queries, location data, and more.\\n\\n3. **Understanding User Habits**: \"获取习惯\" (acquire habits) emphasizes that Baidu not only collects raw data but also analyzes this data to understand user behaviors and preferences. This understanding allows the company to tailor its services and advertising more effectively.\\n\\nThe overall tone is matter-of-fact and informative, providing a clear explanation of how Baidu leverages its products for data collection and analysis. This information is crucial for understanding the company\\'s business model and its capabilities in the digital ecosystem.<|im_end|>\\n',\n",
       " 'audio': 'ZH/ZH_B00001_S08611_W000020.mp3'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first processed example.\n",
    "processed[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
